High profile repos contain more code and a higher percentage of it is statically typed.
# Statically typed bytes vs. total bytes of code per repo (log10 on both
# axes), colored by dataset. Both quantities must be positive so that the
# logs are finite; the original filter repeated the same condition twice and
# never checked the x variable.
plt_data_static_typed <- repo_data_all %>%
  filter(total_file_size_no_data > 0,
         total_bytes_no_data_type_system_static > 0)

ggplot(plt_data_static_typed,
       aes(y = log10(total_bytes_no_data_type_system_static),
           x = log10(total_file_size_no_data),
           col = is_high_profile)) +
  geom_point(size = 1) +
  # Second layer redraws the high profile subset after (i.e. above) the full
  # set. It uses the same filtered data so no log10(0) values reach the plot.
  geom_point(data = subset(plt_data_static_typed, is_high_profile), size = 1) +
  ylab("Total bytes of statically typed code (log10)") +
  xlab("Total bytes of code (log10)") +
  # Reference line y = x: all code in the repo is statically typed
  geom_abline(slope = 1, intercept = 0) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(legend.text = element_text(size = 12),
        legend.title = element_text(size = 14),
        axis.text = element_text(size = 12)) +
  xlim(0, 8) +
  ylim(0, 8)
## Warning: Removed 1 rows containing missing values (geom_point).
High profile repos have more developers and more outside developers.
# Number of repos for each combination of (non-committing authors, commit
# authors, dataset); the count drives the point size in the plot below.
plt_data_committers <- repo_data_all %>%
  select(num_non_committing_authors, commit_authors, is_high_profile) %>%
  group_by(num_non_committing_authors, commit_authors, is_high_profile) %>%
  dplyr::summarize(Num_repos = n()) %>%
  # summarize() only peels off the last grouping level; drop the rest
  ungroup()

ggplot(plt_data_committers,
       aes(y = num_non_committing_authors,
           x = commit_authors,
           col = is_high_profile)) +
  geom_point(aes(size = Num_repos)) +
  # Redraw the high profile subset as a later layer so it sits above the rest
  geom_point(data = subset(plt_data_committers, is_high_profile),
             aes(size = Num_repos)) +
  ylab("Number of non-committing commit authors") +
  xlab("Number of commit authors") +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(legend.text = element_text(size = 12),
        legend.title = element_text(size = 14),
        axis.text = element_text(size = 12)) +
  # Disabled alternative x limit:
  # xlim(0, max(log10(repo_data_all$commit_authors), na.rm = TRUE)) +
  ylim(0, max(repo_data_all$num_non_committing_authors, na.rm = TRUE)) +
  scale_x_log10() +
  # Reference curve y = x
  stat_function(fun = function(x) {x}, geom = "line", color = "black")
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 39 rows containing missing values (geom_path).
High profile repos have more source files but similar file sizes.
# Repos with a positive file count and a positive mean file length; both
# plot axes are log-scaled.
plt_data_num_files <- filter(repo_data_all,
                             num_files_no_data > 0,
                             mean_lines_code_no_data > 0)

# Axis breaks at powers of ten: 1, 10, 100, 1000, 10000
breaks <- 10^(0:4)

# Mean lines of code per file against number of source files, by dataset.
# The high profile subset is drawn a second time as a later layer.
ggplot(plt_data_num_files,
       aes(x = num_files_no_data,
           y = mean_lines_code_no_data,
           col = is_high_profile)) +
  geom_point(size = 1) +
  geom_point(
    data = plt_data_num_files[which(plt_data_num_files$is_high_profile), ],
    size = 1
  ) +
  labs(x = "Total source files",
       y = "Mean lines of code per source file") +
  scale_x_log10(breaks = breaks) +
  scale_y_log10(breaks = breaks) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  guides(color = guide_legend(title = "Dataset")) +
  theme_bw() +
  theme(axis.text = element_text(size = 12),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 12))
Some paper topics are associated with higher community engagement, repo size, and commit activity.
# Per-topic means of several repo metrics, shown as one bar-chart facet per
# metric. Topic columns are selected by name (contains("topic")) and are used
# as logical flags in filter(value): a repo contributes to every topic whose
# flag is TRUE.
as.tbl(repo_data_main) %>%
  select(commits, commit_authors, forks_count, subscribers_count,
         watchers_count, num_citations_per_week_pmc_minus_2_years,
         total_file_size_no_data, num_files_no_data, contains("topic")) %>%
  # Long format: one row per (repo, topic column)
  melt(id.vars = c("commits", "commit_authors", "forks_count",
                   "subscribers_count", "watchers_count",
                   "num_citations_per_week_pmc_minus_2_years",
                   "total_file_size_no_data", "num_files_no_data")) %>%
  filter(value) %>%
  as.tbl() %>%
  select(-value) %>%
  dplyr::rename(Topic = variable) %>%
  group_by(Topic) %>%
  dplyr::summarize(
    `Mean commits` = mean(commits, na.rm = TRUE),
    `Mean commit authors` = mean(commit_authors, na.rm = TRUE),
    `Mean forks` = mean(forks_count, na.rm = TRUE),
    `Mean subscribers` = mean(subscribers_count, na.rm = TRUE),
    `Mean watchers` = mean(watchers_count, na.rm = TRUE),
    `Mean PMC citations / week` =
      mean(num_citations_per_week_pmc_minus_2_years, na.rm = TRUE),
    `Mean megabytes of code` =
      mean(total_file_size_no_data, na.rm = TRUE) / 1000000,
    `Mean number of files` = mean(num_files_no_data, na.rm = TRUE)
  ) %>%
  # Long format again: one row per (topic, summary metric)
  melt(id.vars = "Topic") %>%
  # Turn column names such as "topic_RNA.seq" into display labels: strip the
  # prefix, replace underscores by spaces, then restore the hyphen
  mutate(Topic = gsub("RNA.seq", "RNA-seq",
                      gsub("_", " ", gsub("topic_", "", Topic)))) %>%
  ggplot(aes(x = variable, y = value, fill = factor(Topic))) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_bw() +
  guides(fill = guide_legend(title = "Abstract includes topic")) +
  theme(legend.text = element_text(size = 10),
        legend.title = element_text(size = 11),
        axis.text.y = element_text(size = 10),
        axis.text.x = element_blank(),
        axis.title = element_blank(),
        strip.text = element_text(size = 10)) +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(~variable, scales = "free", ncol = 2)
Committing to the repo after publication is associated with more community engagement, more development activity, and more citations.
# Long-format table of repo metrics keyed by whether the repo has commits
# after the article appeared in PubMed. Columns are picked and given their
# display names in one step; rows with an unknown key are dropped before
# melting.
plt_data_commits_after_pub <- repo_data_main %>%
  as.tbl() %>%
  select(
    `Total commits` = commits,
    `Commit authors` = commit_authors,
    `Total forks` = forks_count,
    `Total subscribers` = subscribers_count,
    `Total watchers` = watchers_count,
    `PMC citations / week` = num_citations_per_week_pmc_minus_2_years,
    `Commit message len` = mean_commit_message_len,
    `Pct outside commits` = pct_commits_diff_author_committer,
    `Outside cmt authors` = num_non_committing_authors,
    `Commits after\npublication` = commits_after_article_in_pubmed
  ) %>%
  filter(!is.na(`Commits after\npublication`)) %>%
  melt(id.vars = "Commits after\npublication")
# Get smallest positive value of each variable so we can take logs
min_pos <- plt_data_commits_after_pub %>%
  filter(value > 0) %>%
  group_by(variable) %>%
  dplyr::summarize(min_pos = min(value))

# Remove top outliers for plot
p_outlier <- 1 # 1 means no filtering for outliers
outlier_cutoff <- plt_data_commits_after_pub %>%
  group_by(variable) %>%
  dplyr::summarize(outlier_cutoff = quantile(value, probs = p_outlier,
                                             na.rm = TRUE))

# Attach the per-variable floor and cutoff, then:
#  - replace 0's and NA's by the minimum positive value of the variable
#  - drop rows above the outlier cutoff
# This is done column-wise. Iterating rows with apply() would coerce the data
# frame to a character matrix (numbers pass through format(), i.e. 7
# significant digits), so the round-tripped maximum could end up slightly
# above outlier_cutoff and be dropped even when p_outlier is 1.
plt_data_commits_after_pub <- plt_data_commits_after_pub %>%
  left_join(min_pos, by = "variable") %>%
  left_join(outlier_cutoff, by = "variable") %>%
  mutate(value_pos = ifelse(is.na(value), min_pos, pmax(value, min_pos))) %>%
  filter(value_pos <= outlier_cutoff) %>%
  select(`Commits after\npublication`, variable, value_pos)
# One boxplot facet per metric (log10 y axis, free scales), with boxes filled
# by the "Commits after publication" key.
ggplot(plt_data_commits_after_pub,
       aes(x = variable,
           y = value_pos,
           fill = `Commits after\npublication`)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  theme_bw() +
  theme(strip.text = element_text(size = 10),
        axis.title = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 10),
        legend.title = element_text(size = 11),
        legend.text = element_text(size = 10))
# Engagement counts (forks, subscribers, watchers) against the number of
# outside commit authors. Every count is shifted by one and named for
# display in a single transmute(); the table is then melted and the number of
# repos at each (x, dataset, metric, y) position is counted.
plt_data_outside_contrib <- repo_data_all %>%
  as.tbl() %>%
  transmute(
    `Total forks + 1` = forks_count + 1,
    `Total subscribers + 1` = subscribers_count + 1,
    `Total watchers + 1` = watchers_count + 1,
    `Outside commit authors + 1` = num_non_committing_authors + 1,
    is_high_profile = is_high_profile
  ) %>%
  melt(id.vars = c("Outside commit authors + 1", "is_high_profile")) %>%
  group_by(`Outside commit authors + 1`, is_high_profile, variable, value) %>%
  dplyr::summarize(`Num repos` = n())
# One facet per engagement metric, log-log axes, point size = number of
# repos. The aesthetics are declared once on the plot and inherited by both
# point layers; the second layer redraws only the high profile subset.
ggplot(plt_data_outside_contrib,
       aes(size = `Num repos`,
           x = `Outside commit authors + 1`,
           y = value,
           col = is_high_profile)) +
  geom_point() +
  geom_point(data = subset(plt_data_outside_contrib, is_high_profile)) +
  scale_x_log10() +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000)) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  guides(color = guide_legend(title = "Dataset")) +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  theme_bw() +
  theme(strip.text = element_text(size = 11),
        axis.text = element_text(size = 12),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 12))
# Commit-activity metrics against total commits. Columns are selected and
# given display names in one step, the "months without commits" column is
# shifted by one (matching its "1 + ..." label), identical rows are counted,
# and the result is melted with total commits, dataset and count as ids.
plt_data_commit_authors <- repo_data_all %>%
  as.tbl() %>%
  select(
    `Total commits` = commits,
    `Mean commits/month` = mean_commits_per_month,
    `Max cons. months with commits` = consecutive_months_with_commits,
    `Project duration (days)` = commit_span_days,
    `Mean new files per month` = mean_files_added_per_month,
    `Days with new files added` = num_days_new_files_added,
    `1 + max cons. months no commits` = consecutive_months_no_commits,
    `High profile` = is_high_profile
  ) %>%
  mutate(
    `1 + max cons. months no commits` = `1 + max cons. months no commits` + 1
  ) %>%
  group_by(`Total commits`,
           `Mean commits/month`,
           `Max cons. months with commits`,
           `Project duration (days)`,
           `Mean new files per month`,
           `Days with new files added`,
           `1 + max cons. months no commits`,
           `High profile`) %>%
  dplyr::summarize(`Num repos` = n()) %>%
  melt(id.vars = c("Total commits", "High profile", "Num repos"))
# One facet per activity metric against total commits, log-log axes. The
# aesthetics are shared by both point layers; the high profile subset is
# drawn again as the later layer.
ggplot(plt_data_commit_authors,
       aes(x = `Total commits`,
           y = value,
           col = `High profile`,
           size = `Num repos`)) +
  geom_point() +
  geom_point(data = subset(plt_data_commit_authors, `High profile`)) +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000)) +
  scale_y_log10(breaks = c(1, 10, 100, 1000)) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  guides(color = guide_legend(title = "Dataset")) +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  theme_bw() +
  theme(strip.text = element_text(size = 10),
        axis.title.y = element_blank(),
        axis.text = element_text(size = 12),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 12))
## Warning: Removed 20 rows containing missing values (geom_point).
# Column-name form of each top language
top_langs_as_header <- sapply(top_langs, format_lang_as_header)

# Names of the per-language file-count and mean-lines-of-code columns
lang_cols <- unname(c(paste0("num_files_", top_langs_as_header),
                      paste0("mean_lines_code_", top_langs_as_header)))

# One table per language (file count, mean lines of code per file, dataset,
# language), restricted to repos with positive values for both metrics, then
# stacked once with bind_rows(). This avoids growing a data frame with rbind()
# inside a loop and the empty seed frame whose column names data.frame() would
# have rewritten (e.g. "Number.of.files").
plt_data_langs <- bind_rows(lapply(unname(top_langs_as_header), function(lang_header) {
  col_nf <- paste0("num_files_", lang_header)
  col_loc <- paste0("mean_lines_code_", lang_header)
  repo_data_all %>%
    select(!!as.name(col_nf), !!as.name(col_loc), is_high_profile) %>%
    # Unquote so the value cannot be confused with a column of the data
    mutate(lang = !!lang_header) %>%
    dplyr::rename(`Number of files` = !!as.name(col_nf),
                  `Mean lines of code per file` = !!as.name(col_loc)) %>%
    filter(`Number of files` > 0 & `Mean lines of code per file` > 0)
}))
# Mean lines of code per file against number of files, one facet per
# language, log-log axes. Both point layers inherit the plot-level
# aesthetics; the second one redraws only the high profile subset.
ggplot(plt_data_langs,
       aes(x = `Number of files`,
           y = `Mean lines of code per file`,
           col = is_high_profile)) +
  geom_point() +
  geom_point(data = subset(plt_data_langs, is_high_profile)) +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000)) +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000)) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  guides(color = guide_legend(title = "Dataset")) +
  facet_wrap(~lang, ncol = 5, scales = "free") +
  theme_bw() +
  theme(strip.text = element_text(size = 10),
        axis.text = element_text(size = 10),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 12))
# Names of the per-language byte-count columns
lang_cols <- unname(paste0("bytes_", top_langs_as_header))

# One (bytes, dataset flag, language) table per language, stacked once with
# bind_rows() rather than grown with rbind() inside a loop.
plt_data_lang_bytes <- bind_rows(lapply(unname(top_langs_as_header), function(lang_header) {
  col_b <- paste0("bytes_", lang_header)
  repo_data_all %>%
    select(!!as.name(col_b), is_high_profile) %>%
    # Unquote so the value cannot be confused with a column of the data
    mutate(lang = !!lang_header) %>%
    dplyr::rename(bytes = !!as.name(col_b))
}))

# Display label for the dataset. Vectorised; a missing flag yields NA instead
# of the "missing value where TRUE/FALSE needed" error of a scalar if().
plt_data_lang_bytes$Dataset <- ifelse(plt_data_lang_bytes$is_high_profile,
                                      "High profile repos",
                                      "Main repos")
# Stacked bars of code size in megabytes per language, one facet per dataset
# with free scales. The legend is hidden; the facet strips carry the dataset
# name.
ggplot(plt_data_lang_bytes) +
  geom_bar(aes(x = lang, y = bytes / 1000000, fill = Dataset),
           stat = "identity") +
  facet_wrap(~Dataset, scales = "free") +
  scale_fill_manual(values = c(color_high_prof, color_main)) +
  labs(x = "Language", y = "Total megabytes of code") +
  theme_bw() +
  theme(legend.position = "none",
        axis.text = element_text(size = 12),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text = element_text(size = 12))
# Number of repos per (year of first commit, year of PubMed date, topic).
# Topic columns are selected by name and used as logical flags in
# filter(value).
plt_data_years_by_topic <- repo_data_main %>%
  select(first_commit, date_pubmed, contains("topic")) %>%
  mutate(year_first_commit = year(first_commit),
         year_pubmed = year(date_pubmed)) %>%
  select(-first_commit, -date_pubmed) %>%
  melt(id.vars = c("year_first_commit", "year_pubmed")) %>%
  filter(value) %>%
  group_by(year_first_commit, year_pubmed, variable) %>%
  dplyr::summarize(num_repos = n()) %>%
  # summarize() only peels off the last grouping level; drop the rest
  ungroup() %>%
  filter(!is.na(year_first_commit) & !is.na(year_pubmed)) %>%
  # Namespace-qualified like the other rename() calls in this file
  dplyr::rename(Topic = variable)

# Turn column names such as "topic_RNA.seq" into display labels. gsub() is
# already vectorised (and converts a factor to character), so no per-element
# sapply() is needed.
plt_data_years_by_topic$Topic <-
  gsub("_", " ",
       gsub("topic_", "",
            gsub("RNA.seq", "RNA-seq", plt_data_years_by_topic$Topic)))

# Common axis range covering both year variables
min_year <- min(c(plt_data_years_by_topic$year_first_commit,
                  plt_data_years_by_topic$year_pubmed))
max_year <- max(c(plt_data_years_by_topic$year_first_commit,
                  plt_data_years_by_topic$year_pubmed))
# PubMed year against year of first commit, one facet per topic; point size
# is the number of repos. Both axes share the same [min_year, max_year] range.
ggplot(plt_data_years_by_topic,
       aes(x = year_first_commit,
           y = year_pubmed,
           size = num_repos)) +
  geom_point() +
  facet_wrap(~Topic, ncol = 2, scales = "fixed") +
  xlim(min_year, max_year) +
  ylim(min_year, max_year) +
  theme_bw() +
  theme(strip.text = element_text(size = 10),
        axis.text = element_text(size = 12),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 12))
# License of every repo together with a labelled dataset factor. The labels
# are tied to FALSE/TRUE explicitly: overwriting levels() positionally would
# attach "Main repos" to TRUE if the data contained only high profile repos.
plt_data_licenses <- repo_data_all %>%
  select(license, is_high_profile) %>%
  mutate(is_high_profile = factor(is_high_profile,
                                  levels = c(FALSE, TRUE),
                                  labels = c("Main repos",
                                             "High profile repos")))

# Number of repos per license, one facet per dataset. geom_bar() counts rows
# per x value by default; geom_histogram(stat = "count") draws the same bars
# but warns about ignored binning parameters.
ggplot(plt_data_licenses, aes(x = license)) +
  geom_bar() +
  facet_wrap(~is_high_profile, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text = element_text(size = 10)) +
  ylab("Number of repos") +
  xlab("License")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
# Execution-method and type-system tables (presumably BigQuery tables read
# through bigrquery -- confirm), with language names lower-cased to serve as
# the join key.
exec_method <- list_tabledata(project = proj_main,
                              dataset = ds_lang,
                              table = table_exec_method) %>%
  mutate(language = tolower(language))
type_system <- list_tabledata(project = proj_main,
                              dataset = ds_lang,
                              table = table_type_system) %>%
  mutate(language = tolower(language))

# One row per top language with its execution-method and type-system columns.
# stringsAsFactors = FALSE keeps `language` a character vector, so the joins
# no longer have to coerce a factor key (which raised a warning).
lang_features <- data.frame(language = tolower(top_langs),
                            lang_header = top_langs_as_header,
                            stringsAsFactors = FALSE) %>%
  left_join(exec_method, by = "language") %>%
  left_join(type_system, by = "language")
## Warning: Column `language` joining factor and character vector, coercing
## into character vector
# Sum a per-language column of `repo_data_all` over a subset of repos.
#
# prefix:    column-name prefix, e.g. "bytes_"; the column for a language is
#            paste0(prefix, <lang_header>).
# keep_rows: logical vector over the rows of `repo_data_all`; rows where it is
#            TRUE are summed (NA entries are dropped by which()).
#
# Returns one total per entry of `lang_features$lang_header`, as a double
# vector. Summing as double avoids the NA that sum() returns when an integer
# total exceeds .Machine$integer.max. Reads the globals `lang_features` and
# `repo_data_all`.
sum_lang <- function(prefix, keep_rows) {
  rows <- which(keep_rows)
  vapply(lang_features$lang_header,
         function(x) {
           col <- paste0(prefix, x)
           if (!col %in% names(repo_data_all)) {
             stop("Column not found in repo_data_all: ", col, call. = FALSE)
           }
           sum(as.numeric(repo_data_all[[col]][rows]))
         },
         numeric(1))
}
# Total bytes and file counts per language, separately for each dataset
lang_features$bytes_high_profile <- sum_lang("bytes_", repo_data_all$is_high_profile)
lang_features$bytes_main <- sum_lang("bytes_", !repo_data_all$is_high_profile)
lang_features$files_high_profile <- sum_lang("num_files_", repo_data_all$is_high_profile)
lang_features$files_main <- sum_lang("num_files_", !repo_data_all$is_high_profile)

# Human-readable execution mode and type-system label for each language
lang_features$exec <- ""
lang_features$type <- ""
# seq_len() so that an empty table is skipped instead of iterating over c(1, 0)
for (i in seq_len(nrow(lang_features))) {
  # Execution method: NA when neither flag is TRUE
  interpreted <- isTRUE(lang_features[i, "interpreted"])
  compiled <- isTRUE(lang_features[i, "compiled"])
  if (interpreted && compiled) {
    lang_features[i, "exec"] <- "Both"
  } else if (interpreted) {
    lang_features[i, "exec"] <- "Interpreted"
  } else if (compiled) {
    lang_features[i, "exec"] <- "Compiled"
  } else {
    lang_features[i, "exec"] <- NA
  }

  # Type system: the available parts (strength, system, safety) joined by
  # spaces, with strength capitalized. Missing parts are left out. Pasting
  # onto an NA placeholder would produce the literal text "NA ..." when
  # strength is missing, and such a label is not caught by is.na() later on.
  strength <- as.character(lang_features[i, "strength"])
  system <- as.character(lang_features[i, "system"])
  safety <- as.character(lang_features[i, "safety"])
  if (!is.na(strength)) strength <- capitalize(strength)
  type_parts <- c(strength, system, safety)
  type_parts <- type_parts[!is.na(type_parts)]
  if (length(type_parts) > 0) {
    lang_features[i, "type"] <- paste(type_parts, collapse = " ")
  } else {
    lang_features[i, "type"] <- NA
  }
}
# Share of bytes / files per (execution mode, type system) combination,
# normalized within each of the four panels (bytes or files x dataset).
# Languages lacking either label are dropped.
plt_data_lang_features <- lang_features %>%
  select(exec, type, bytes_high_profile,
         bytes_main, files_high_profile, files_main) %>%
  filter(!is.na(exec) & !is.na(type)) %>%
  melt(id.vars = c("exec", "type")) %>%
  # Split e.g. "bytes_high_profile" into the dataset flag and the metric name
  mutate(is_high_profile = grepl("high_profile", variable)) %>%
  mutate(variable = gsub("_main", "", gsub("_high_profile", "", variable))) %>%
  # Panel title, e.g. "Bytes - main repos". Built column-wise, which also
  # works for an empty table (a 1:nrow() row loop would not) and leaves no
  # loop variables behind in the workspace.
  mutate(var = paste(capitalize(variable),
                     ifelse(is_high_profile,
                            "- high profile repos",
                            "- main repos"))) %>%
  select(exec, type, var, value) %>%
  group_by(var) %>%
  mutate(sum_var = sum(value)) %>%
  ungroup() %>%
  mutate(val_normalized = value / sum_var)
# Type system against execution mode, one facet per panel title, with point
# size proportional to the normalized share. The size legend is hidden.
ggplot(plt_data_lang_features,
       aes(x = exec, y = type, size = val_normalized)) +
  geom_point() +
  facet_wrap(~var, ncol = 2, scales = "free") +
  labs(x = "Execution mode", y = "Type system") +
  theme_bw() +
  theme(legend.position = "none",
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 11),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text = element_text(size = 11))
# Print the R version, platform and attached/loaded package versions
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2 jsonlite_1.5 Hmisc_4.1-1 Formula_1.2-2
## [5] survival_2.41-3 lattice_0.20-35 bigrquery_0.4.1 lubridate_1.7.1
## [9] ggplot2_2.2.1 dplyr_0.7.4 reshape2_1.4.3
##
## loaded via a namespace (and not attached):
## [1] splines_3.4.3 colorspace_1.3-2 htmltools_0.3.6
## [4] yaml_2.1.16 base64enc_0.1-3 rlang_0.1.6
## [7] pillar_1.0.1 foreign_0.8-69 glue_1.2.0
## [10] DBI_0.7 RColorBrewer_1.1-2 bindr_0.1
## [13] plyr_1.8.4 stringr_1.2.0 munsell_0.4.3
## [16] gtable_0.2.0 htmlwidgets_0.9 evaluate_0.10.1
## [19] labeling_0.3 latticeExtra_0.6-28 knitr_1.18
## [22] curl_3.1 htmlTable_1.11.2 Rcpp_0.12.14
## [25] acepack_1.4.1 openssl_0.9.9 scales_0.5.0
## [28] backports_1.1.2 checkmate_1.8.5 gridExtra_2.3
## [31] digest_0.6.13 stringi_1.1.6 grid_3.4.3
## [34] rprojroot_1.3-2 tools_3.4.3 magrittr_1.5
## [37] lazyeval_0.2.1 tibble_1.4.1 cluster_2.0.6
## [40] pkgconfig_2.0.1 Matrix_1.2-12 data.table_1.10.4-3
## [43] assertthat_0.2.0 rmarkdown_1.8 httr_1.3.1
## [46] rstudioapi_0.7 R6_2.2.2 rpart_4.1-11
## [49] nnet_7.3-12 compiler_3.4.3